/*
This software module was originally developed by 
        Hang-Seop Lee (hslee@etri.re.kr), ETRI
        Jung-Chul Lee (jclee@etri.re.kr), ETRI
and edited by Hang-Seop Lee, Jung-Chul Lee of ETRI, 
in the course of development of the MPEG-4.
This software module is an implementation of a part of one or
more MPEG-4 tools as specified by the MPEG-4.
ISO/IEC gives users of the MPEG-4 free license to this                
software module or modifications thereof for use in hardware
or software products claiming conformance to the MPEG-4.
Those intending to use this software module in hardware or software
products are advised that its use may infringe existing patents.
The original developer of this software module and his/her company,
the subsequent editors and their companies, and ISO/IEC have no
liability for use of this software module or modifications thereof
in an implementation.
Copyright is not released for non MPEG-4 conforming
products. ETRI retains full right to use the code for his/her own
purpose, assign or donate the code to a third party and to
inhibit third parties from using the code for non
MPEG-4 conforming products.
This copyright notice must be included in all copies or
derivative works. Copyright (c) 1997.
*/    

#include "stdafx.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <malloc.h>
#include <sys/stat.h>
#include <ctype.h>
#include <fcntl.h>
#include <windows.h>

	/* For Bit stream */
/* Bit-packing state handed to every data_write() call:
   YY[0] is the output byte currently being assembled and
   YY[1] is the number of bits still free in that byte
   (Encoder() starts it at 0 / 8). */
char YY[2];

/* Encoder settings.  Encoder() copies its E_* arguments into these, and
   ttsSequence::init() / ttsSequence::Sntinit() read them back. */

/* Sequence-level values consumed by ttsSequence::init(). */
int G_Lang_v;
int G_Gender;
int G_Gender_v;
int G_Age;
int G_Age_v;
int G_Spch;
int G_Spch_v;

/* Prosody / video / lip-shape / trick-mode switches and the three
   per-sentence prosody flags (duration, F0 contour, energy contour). */
int G_Proso;
int G_Proso_d;
int G_Proso_F0;
int G_Proso_e;
int G_Video;
int G_Lip;
int G_Trick;

/*
 * Prosody information attached to one sentence.
 *
 * Every member is public; ttsSequence fills the fields in put_prosody(),
 * serialises them in save_ttsSentence() and releases the buffers in
 * clear_ttsSentence().
 */
class ttsProsody {
 public:
    ttsProsody();
    /* NOTE(review): declared only - no definition appears in this file. */
    ~ttsProsody();

    /* Flags telling which optional per-phoneme tables are present
       (each is written to the stream as a single bit). */
    char durEnable;
    char f0ContourEnable;
    char energyContourEnable;

    /* Phoneme count and the packed symbol string (two bytes per symbol,
       phonemeSymbolsLength bytes in total). */
    int numberPhonemes;
    int phonemeSymbolsLength;
    char *phonemeSymbols;

    /* Per-phoneme tables: one duration per phoneme, three F0 values and
       three energy values per phoneme.  NULL while not allocated. */
    int *durEachPhoneme;
    int *f0ContourEachPhoneme;
    int *energyContourEachPhoneme;
};

/* Builds an empty prosody record: all three enable flags cleared, no
   phonemes, and every buffer pointer NULL. */
ttsProsody::ttsProsody()
    : durEnable(0),
      f0ContourEnable(0),
      energyContourEnable(0),
      numberPhonemes(0),
      phonemeSymbolsLength(0),
      phonemeSymbols(NULL),
      durEachPhoneme(NULL),
      f0ContourEachPhoneme(NULL),
      energyContourEachPhoneme(NULL)
{
}

/*
 * One TTS sentence record.
 *
 * All members are public; ttsSequence owns a pointer to the current
 * sentence and reads/writes these fields directly.  The bit widths quoted
 * below are the ones ttsSequence::save_ttsSentence() uses when it writes
 * the field.
 */
class ttsSentence {
 public:
    ttsSentence(int i);
    /* NOTE(review): declared only - no definition appears in this file. */
    ~ttsSentence();

    /* Record header. */
    int tts_sentence_start_code;    /* 32 bits */
    int ttsSentenceID;              /* 10 bits */

    /* Silence marker and its duration (duration written only when set). */
    char silence;                   /* 1 bit */
    int silenceDuration;            /* 12 bits */

    /* Speaker attributes, each written only when enabled in the sequence. */
    char gender;                    /* 1 bit */
    char age;                       /* 3 bits */
    char speechRate;                /* 4 bits */

    /* Sentence text: lengthText bytes stored in a malloc()ed buffer. */
    int lengthText;                 /* 12 bits */
    char *ttsText;

    /* Optional prosody record (set by ttsSequence::Sntinit()). */
    ttsProsody *prosody;

    /* Video synchronisation data filled in by ttsSequence::put_position().
       startTime is multiplied by 1000 before being compared with
       millisecond values - presumably it is in seconds. */
    float startTime;
    int sentenceDuration;           /* 16 bits */
    int positionInSentence;         /* 16 bits */
    int offset;                     /* 10 bits */

    /* Lip-shape table: numberLipShape entries in each array. */
    int numberLipShape;             /* 10 bits */
    int *LipShapeInSentence;        /* 16 bits per entry */
    int *lipShape;                  /* 8 bits per entry */
};

/* Creates sentence number i with the fixed start code, no text, no
   prosody, no lip shapes and the default speech-rate code. */
ttsSentence::ttsSentence(int i)
    : tts_sentence_start_code(0x012121212),
      ttsSentenceID(i),
      silence(0),
      silenceDuration(0),
      gender(0),
      age(0),
      speechRate(3),   /* r=[0.7 - 1.6], R=r*10 -3 : default r=1 */
      lengthText(0),
      ttsText(NULL),
      prosody(NULL),
      startTime(0.0f),
      sentenceDuration(0),
      positionInSentence(0),
      offset(0),
      numberLipShape(0),
      LipShapeInSentence(NULL),
      lipShape(NULL)
{
}


/*
 * TTS sequence: holds the sequence-level header fields, a pointer to the
 * sentence currently being encoded, and the routines that pack both into
 * the output bit stream.
 */
class ttsSequence {
 public:
    ttsSequence(int i);
    /* NOTE(review): declared only - no definition appears in this file. */
    ~ttsSequence();

    /* Loads the sequence-level flags from the G_* globals. */
    void init();

    /* ---- sequence-level flags: writers ---- */
    void set_LangEn(int i)   { languageCode = i; }
    void set_genderEn(int i) { genderEnable = i; }
    void set_ageEn(int i)    { ageEnable = i; }
    void set_spRateEn(int i) { speechRateEnable = i; }
    void set_prosodyEn(int i){ prosodyEnable = i; }
    void set_videoEn(int i)  { videoEnable = i; }
    void set_lipEn(int i)    { lipShapeEnable = i; }
    void set_trickEn(int i)  { trickModeEnable = i; }

    /* ---- sequence-level flags: readers ---- */
    int get_LangEn()    { return static_cast<int>(languageCode); }
    int get_genderEn()  { return static_cast<int>(genderEnable); }
    int get_ageEn()     { return static_cast<int>(ageEnable); }
    int get_spRateEn()  { return static_cast<int>(speechRateEnable); }
    int get_prosodyEn() { return static_cast<int>(prosodyEnable); }
    int get_videoEn()   { return static_cast<int>(videoEnable); }
    int get_lipEn()     { return static_cast<int>(lipShapeEnable); }
    int get_trickEn()   { return static_cast<int>(trickModeEnable); }

    /* Loads the per-sentence settings of the attached sentence from the
       G_* globals; a sentence must have been attached first. */
    void Sntinit();

    /* Attaches the sentence all following calls operate on (not copied). */
    void AddSentence(ttsSentence *ttsSnt) { sentence = ttsSnt; }

    /* ---- attached sentence: writers ----
       The three prosody writers dereference sentence->prosody. */
    void set_gender(int i) { sentence->gender = i; }
    void set_age(int i)    { sentence->age = i; }
    void set_spRate(int i) { sentence->speechRate = i; }
    void set_durEn(int i)  { sentence->prosody->durEnable = i; }
    void set_f0En(int i)   { sentence->prosody->f0ContourEnable = i; }
    void set_enEn(int i)   { sentence->prosody->energyContourEnable = i; }

    /* ---- attached sentence: readers ---- */
    int get_gender() { return static_cast<int>(sentence->gender); }
    int get_age()    { return static_cast<int>(sentence->age); }
    int get_spRate() { return static_cast<int>(sentence->speechRate); }
    int get_durEn()  { return static_cast<int>(sentence->prosody->durEnable); }
    int get_f0En()   { return static_cast<int>(sentence->prosody->f0ContourEnable); }
    int get_enEn()   { return static_cast<int>(sentence->prosody->energyContourEnable); }

    /* Stores a copy of Text in the attached sentence. */
    void put_text(char *Text);

    /* Copies the stored sentence text into Text (caller supplies a buffer
       that is large enough) and returns the stored length. */
    int get_text(char *Text)
    {
        strcpy(Text, sentence->ttsText);
        return static_cast<int>(sentence->lengthText);
    }

    void put_prosody(int Nphone, short Sphone[][6],short Dur[],short F0[][3],short En[][3]);
    void put_position(int itime, float stime, int dur);
    /* NOTE(review): declared only - no definition appears in this file. */
    void put_lip(FILE *fp);

    /* Bit-stream output. */
    void save_ttsSequence(FILE *fout);
    void save_ttsSentence(FILE *fout);
    void clear_ttsSentence();

    /* Bit packers: append the low nbits bits of each of the n values in A
       to fout, using B as the two-byte packing state. */
    void data_write(char *A, int nbits, int n, char *B, FILE *fout);
    void data_write(short *A, int nbits, int n, char *B, FILE *fout);
    void data_write(int *A, int nbits, int n, char *B, FILE *fout);

 private:
    /* Sequence header fields, in the order save_ttsSequence() emits them. */
    int tts_sequence_start_code;
    int ttsSequenceID;
    char languageCode;
    char genderEnable;
    char ageEnable;
    char speechRateEnable;
    char prosodyEnable;
    char videoEnable;
    char lipShapeEnable;
    char trickModeEnable;

    /* Sentence currently attached through AddSentence(). */
    ttsSentence *sentence;

    /* Frame interval set by put_position() and the running count of
       frames already written by save_ttsSentence(). */
    int itime;
    int iframe;
};

/* Creates sequence number i with the fixed start code, every enable flag
   cleared, no sentence attached and the frame bookkeeping at zero. */
ttsSequence::ttsSequence(int i)
    : tts_sequence_start_code(0x000000011),
      ttsSequenceID(i),
      languageCode(0),
      genderEnable(0),
      ageEnable(0),
      speechRateEnable(0),
      prosodyEnable(0),
      videoEnable(0),
      lipShapeEnable(0),
      trickModeEnable(0),
      sentence(NULL),
      itime(0),
      iframe(0)
{
}


void ttsSequence::init()
{
    set_LangEn(G_Lang_v);

    set_genderEn(G_Gender);

    set_ageEn(G_Age);

    set_spRateEn(G_Spch);

    set_prosodyEn(G_Proso);

    set_videoEn(G_Video);

    set_lipEn(G_Lip);

    set_trickEn(G_Trick);
}


/* Initialises the attached sentence from the G_* globals.  Each value is
   applied only when the matching sequence-level flag is set; when prosody
   is enabled a fresh ttsProsody is created and attached to the sentence
   before its three flags are loaded. */
void ttsSequence::Sntinit()
{
    if(genderEnable) sentence->gender=G_Gender_v;
    if(ageEnable) sentence->age=G_Age_v;
    if(speechRateEnable) sentence->speechRate=G_Spch_v;

    if(!prosodyEnable) return;

    sentence->prosody=new ttsProsody();
    sentence->prosody->durEnable=G_Proso_d;
    sentence->prosody->f0ContourEnable=G_Proso_F0;
    sentence->prosody->energyContourEnable=G_Proso_e;
}

/*
 * Stores a private copy of the NUL-terminated string Text in the attached
 * sentence and records its length (in bytes, terminator excluded).
 *
 * Any text left over from an earlier call is released first, so calling
 * this twice without clear_ttsSentence() in between no longer leaks the
 * old buffer.  ttsText is NULL after construction and after
 * clear_ttsSentence(), which free() accepts as a no-op.
 *
 * NOTE(review): the malloc() result is not checked, as elsewhere in this
 * file.
 */
void ttsSequence::put_text(char *Text) 
{ 
    const size_t nbytes=strlen(Text);

    free(sentence->ttsText);
    sentence->ttsText=(char *)malloc(nbytes+2);
    strcpy(sentence->ttsText,Text); 
    sentence->lengthText=(int)nbytes; 
}

/*
 * Copies the prosody tables of one sentence into sentence->prosody.
 *
 *   Nphone  number of phonemes (rows used in every table)
 *   Sphone  per phoneme up to 6 symbol codes; a 0 code ends the row early
 *   Dur     one duration per phoneme           (copied when durEnable)
 *   F0      three F0 values per phoneme        (copied when f0ContourEnable)
 *   En      three energy values per phoneme    (copied when energyContourEnable)
 *
 * Each symbol code is stored as two bytes, high byte first, so
 * phonemeSymbolsLength is twice the number of non-zero codes.
 *
 * Fix: the energy branch used to allocate and fill f0ContourEachPhoneme
 * from F0.  That overwrote (and leaked) the F0 table and left
 * energyContourEachPhoneme NULL although save_ttsSentence() writes it.
 * It now fills energyContourEachPhoneme from En.
 *
 * Buffers left from an earlier call are released before being replaced;
 * free() on the NULL pointers set by the constructor and by
 * clear_ttsSentence() is a no-op.
 *
 * Requires sentence->prosody to be non-NULL (Sntinit() with prosody
 * enabled).  NOTE(review): malloc() results are not checked, as elsewhere
 * in this file.
 */
void ttsSequence::put_prosody(int Nphone, short Sphone[][6],short Dur[],short F0[][3],short En[][3])
{ 
    int i, j, k; 
    ttsProsody *p=sentence->prosody;

    p->numberPhonemes=Nphone;

    /* Pass 1: count the non-zero symbol codes. */
    for(i=k=0; i<Nphone; i++) 
        for(j=0; j<6 && Sphone[i][j]!=0; j++) k++;

    p->phonemeSymbolsLength=k*2;
    free(p->phonemeSymbols);
    p->phonemeSymbols=(char *)malloc(p->phonemeSymbolsLength+2);

    /* Pass 2: store every code as two bytes, high byte first. */
    for(i=k=0; i<Nphone; i++) {
        for(j=0; j<6 && Sphone[i][j]!=0; j++) {
            p->phonemeSymbols[k++]=(Sphone[i][j] >> 8) & 0xFF;
            p->phonemeSymbols[k++]=(Sphone[i][j] & 0xFF);
        }
    }

    if(get_durEn()) {
        free(p->durEachPhoneme);
        p->durEachPhoneme=(int *)malloc(Nphone*sizeof(int));
        for(i=0; i<Nphone; i++) p->durEachPhoneme[i]=Dur[i];
    }

    if(get_f0En()) {
        free(p->f0ContourEachPhoneme);
        p->f0ContourEachPhoneme=(int *)malloc(Nphone*3*sizeof(int));
        for(i=0; i<Nphone; i++) 
            for(j=0; j<3; j++) p->f0ContourEachPhoneme[i*3+j]=F0[i][j];
    }

    if(get_enEn()) {
        free(p->energyContourEachPhoneme);
        p->energyContourEachPhoneme=(int *)malloc(Nphone*3*sizeof(int));
        for(i=0; i<Nphone; i++) 
            for(j=0; j<3; j++) p->energyContourEachPhoneme[i*3+j]=En[i][j];
    }
}

void ttsSequence::put_position(int t0, float stime, int dur)
{
    int i;

    sentence->startTime=stime;
    sentence->sentenceDuration=dur;
    sentence->positionInSentence=0;
    itime=t0;
    i=(int)(stime*1000); sentence->offset=i%itime;
}

/*
 * Writes the sequence header to fout through the global packing state YY:
 * start code (32 bits), sequence ID (5), language code (10) and the seven
 * one-bit enable flags, then zero bits up to the next byte boundary.
 */
void ttsSequence::save_ttsSequence(FILE *fout)
{
    data_write(&tts_sequence_start_code,32,1,YY,fout);
    data_write(&ttsSequenceID,5,1,YY,fout);
    data_write(&languageCode,10,1,YY,fout);

    data_write(&genderEnable,1,1,YY,fout);
    data_write(&ageEnable,1,1,YY,fout);
    data_write(&speechRateEnable,1,1,YY,fout);
    data_write(&prosodyEnable,1,1,YY,fout);
    data_write(&videoEnable,1,1,YY,fout);
    data_write(&lipShapeEnable,1,1,YY,fout);
    data_write(&trickModeEnable,1,1,YY,fout);

    /* YY[1] is the number of bits still free in the current byte;
       anything other than 8 means a partly filled byte is pending. */
    const int freeBits=YY[1];
    if(freeBits!=8) {
        char zero=0;
        data_write(&zero,freeBits,1,YY,fout);
    }
}

void ttsSequence::clear_ttsSentence()
{
    ttsSentence *s=sentence;
    ttsProsody *p=s->prosody;

    if(!s->silence) {
		free(s->ttsText); 
		s->ttsText=NULL;
		if(prosodyEnable) {
			free(p->phonemeSymbols); p->phonemeSymbols=NULL;
			if(get_durEn())
			{ free(p->durEachPhoneme); p->durEachPhoneme=NULL; }
			if(get_f0En())
			{ free(p->f0ContourEachPhoneme); p->f0ContourEachPhoneme=NULL; }
			if(get_enEn()) {
				free(p->energyContourEachPhoneme); 
				p->energyContourEachPhoneme=NULL;
			}
	    }
    	if(lipShapeEnable) {
	    free(s->LipShapeInSentence); s->LipShapeInSentence=NULL;
	    free(s->lipShape); s->lipShape=NULL;
		}   
	}
}


/*
 * Writes the attached sentence to fout through the global packing state YY.
 *
 * Without video a single sentence record is written.  With video enabled:
 *   1. filler records (flag bit 1 followed by itime in 12 bits) are written
 *      for every whole frame interval between the frames already emitted
 *      (iframe*itime) and the sentence start time;
 *   2. the sentence record is then written once per frame interval covered
 *      by offset+sentenceDuration (rounded up), with positionInSentence
 *      advanced and offset cleared after each copy.
 * Every record is padded with zero bits to a byte boundary.
 *
 * Requires put_position() to have set itime when video is enabled (itime is
 * used as a divisor), and the prosody / lip-shape buffers to be filled when
 * the matching flags are set.
 */
void ttsSequence::save_ttsSentence(FILE *fout)
{
    ttsSentence *s=sentence;
    ttsProsody *p=s->prosody;
    int i, j, Ts, Tn, N, nphone; char c;

    if(videoEnable) {
		/* Ts: sentence start scaled by 1000; Tn: time covered by frames written so far. */
		Ts=(int)(s->startTime*1000);  Tn=iframe*itime;
		/* N: whole frame intervals to fill before the sentence starts. */
		N=(Ts-Tn)/itime; 
		for(i=0; i<N; i++) {
			data_write(&s->tts_sentence_start_code,32,1,YY,fout);
			data_write(&s->ttsSentenceID,10,1,YY,fout);
			/* Same position as the silence flag / silenceDuration of a normal record. */
			c=1; data_write(&c,1,1,YY,fout);
			data_write(&itime,12,1,YY,fout);
			/* Pad with zero bits up to the next byte boundary. */
			if(YY[1]!=8) { c=0; j=YY[1]; data_write(&c,j,1,YY,fout); }
	    } iframe+=N;
		/* N: number of frame intervals the sentence spans, rounded up. */
		Tn=s->offset+s->sentenceDuration; N=Tn/itime;
		if((Tn%itime)!=0) N++;
	} 
    else N=1;

    for(i=0; i<N; i++) {	
    	data_write(&s->tts_sentence_start_code,32,1,YY,fout);
    	data_write(&s->ttsSentenceID,10,1,YY,fout);
    	data_write(&s->silence,1,1,YY,fout);
    	if(s->silence) data_write(&s->silenceDuration,12,1,YY,fout);
    	else {
			if(genderEnable) data_write(&s->gender,1,1,YY,fout);
			if(ageEnable) data_write(&s->age,3,1,YY,fout);
			/* Speech rate is written only when video is disabled. */
			if(!videoEnable && speechRateEnable)
				data_write(&s->speechRate,4,1,YY,fout);
    	    data_write(&s->lengthText,12,1,YY,fout);
    	    data_write(s->ttsText,8,s->lengthText,YY,fout);
			if(prosodyEnable) {
				data_write(&p->durEnable,1,1,YY,fout);
				data_write(&p->f0ContourEnable,1,1,YY,fout);
				data_write(&p->energyContourEnable,1,1,YY,fout);
				nphone=p->numberPhonemes;
				data_write(&nphone,10,1,YY,fout);
				data_write(&p->phonemeSymbolsLength,13,1,YY,fout);
				data_write(p->phonemeSymbols,8,p->phonemeSymbolsLength,YY,fout);
				/* Optional tables: 1 duration, 3 F0 and 3 energy values per phoneme. */
				if(get_durEn())
	    			data_write(p->durEachPhoneme,12,nphone,YY,fout);
				if(get_f0En())
	    			data_write(p->f0ContourEachPhoneme,8,nphone*3,YY,fout);
				if(get_enEn())
	    			data_write(p->energyContourEachPhoneme,8,nphone*3,YY,fout);
	        }
    	    if(videoEnable) {
				data_write(&s->sentenceDuration,16,1,YY,fout);
				data_write(&s->positionInSentence,16,1,YY,fout);
				data_write(&s->offset,10,1,YY,fout);
				/* Advance the position for the next copy: the first copy covers
				   only the part of the interval after offset. */
				if(s->offset) s->positionInSentence+=(itime-s->offset);
				else s->positionInSentence+=itime;
				s->offset=0;
	        }
    	    if(lipShapeEnable) {
				j=s->numberLipShape;
				data_write(&j,10,1,YY,fout);
				data_write(s->LipShapeInSentence,16,j,YY,fout);
				data_write(s->lipShape,8,j,YY,fout);
		    }   
		}
        /* Pad with zero bits up to the next byte boundary. */
        if(YY[1]!=8) { c=0; j=YY[1]; data_write(&c,j,1,YY,fout); }
	} 
    /* Account for the frames just written. */
    if(videoEnable) iframe+=N;
}

/*
 * Bit packer for char values.
 *
 * Appends the low nbits bits of each of the n values in A to fout, most
 * significant bit first.  B is the packing state: B[0] is the byte being
 * assembled and B[1] the number of bits still free in it.  A byte is
 * written to fout every time it fills up; the state is stored back into
 * B on return so the next call continues in the same byte.
 */
void ttsSequence::data_write(char *A, int nbits, int n, char *B, FILE *fout)
{
    /* lowMask[k] keeps the k lowest bits. */
    static const short lowMask[9]={ 0,1,3,7,0x0F,0x01F,0x03F,0x07F,0x0FF };

    unsigned char acc=B[0];
    int freeBits=B[1];

    for(int idx=0; idx<n; idx++) {
        int pending=nbits;
        while(pending!=0) {
            unsigned char piece;
            if(freeBits>pending) {
                /* Everything still pending fits into the current byte. */
                piece=A[idx] & lowMask[pending];
                acc=acc | (piece << (freeBits-pending));
                freeBits-=pending;
                pending=0;
            }
            else {
                /* Fill the byte with the top pending bits and emit it. */
                piece=A[idx] >> (pending-freeBits);
                acc=acc | (piece & lowMask[freeBits]);
                pending-=freeBits;
                freeBits=8;
                fwrite(&acc,1,1,fout);
                acc=0;
            }
        }
    }

    B[0]=acc;
    B[1]=freeBits;
}

/*
 * Bit packer for short values.
 *
 * Appends the low nbits bits of each of the n values in A to fout, most
 * significant bit first.  B is the packing state: B[0] is the byte being
 * assembled and B[1] the number of bits still free in it.  A byte is
 * written to fout every time it fills up; the state is stored back into
 * B on return so the next call continues in the same byte.
 */
void ttsSequence::data_write(short *A, int nbits, int n, char *B, FILE *fout)
{
    /* lowMask[k] keeps the k lowest bits. */
    static const short lowMask[9]={ 0,1,3,7,0x0F,0x01F,0x03F,0x07F,0x0FF };

    unsigned char acc=B[0];
    int freeBits=B[1];

    for(int idx=0; idx<n; idx++) {
        int pending=nbits;
        while(pending!=0) {
            unsigned short piece;
            if(freeBits>pending) {
                /* Everything still pending fits into the current byte. */
                piece=A[idx] & lowMask[pending];
                acc=acc | (piece << (freeBits-pending));
                freeBits-=pending;
                pending=0;
            }
            else {
                /* Fill the byte with the top pending bits and emit it. */
                piece=A[idx] >> (pending-freeBits);
                acc=acc | (piece & lowMask[freeBits]);
                pending-=freeBits;
                freeBits=8;
                fwrite(&acc,1,1,fout);
                acc=0;
            }
        }
    }

    B[0]=acc;
    B[1]=freeBits;
}

/*
 * Bit packer for int values.
 *
 * Appends the low nbits bits of each of the n values in A to fout, most
 * significant bit first.  B is the packing state: B[0] is the byte being
 * assembled and B[1] the number of bits still free in it.  A byte is
 * written to fout every time it fills up; the state is stored back into
 * B on return so the next call continues in the same byte.
 */
void ttsSequence::data_write(int *A, int nbits, int n, char *B, FILE *fout)
{
    /* lowMask[k] keeps the k lowest bits. */
    static const short lowMask[9]={ 0,1,3,7,0x0F,0x01F,0x03F,0x07F,0x0FF };

    unsigned char acc=B[0];
    int freeBits=B[1];

    for(int idx=0; idx<n; idx++) {
        int pending=nbits;
        while(pending!=0) {
            unsigned int piece;
            if(freeBits>pending) {
                /* Everything still pending fits into the current byte. */
                piece=A[idx] & lowMask[pending];
                acc=acc | (piece << (freeBits-pending));
                freeBits-=pending;
                pending=0;
            }
            else {
                /* Fill the byte with the top pending bits and emit it. */
                piece=A[idx] >> (pending-freeBits);
                acc=acc | (piece & lowMask[freeBits]);
                pending-=freeBits;
                freeBits=8;
                fwrite(&acc,1,1,fout);
                acc=0;
            }
        }
    }

    B[0]=acc;
    B[1]=freeBits;
}

/* Input readers defined further down; declared here because Encoder()
   calls them first. */

/* Reads the next sentence of the text file into Text; returns its length. */
int rd_txt(FILE *fp, char *Text);

/* Reads one "begin ... end" prosody table; returns the number of rows. */
int read_prosody(FILE *fp,short Sphon[][6],short Dur[],short F0[][3],short En[][3]);

/* Reads one "begin ... end" video timing block. */
void read_video(FILE *fp,float *stime,int *dur);

/* NOTE(review): nothing in this file reads or writes the globals below -
   presumably they are used by another translation unit. */
int Niframe;
int v_st;
int v_ed;
int sp_st;
int sp_ed;
float Itime;

void Encoder(int E_Lang_v, int E_Gender, int E_Gender_v, int E_Age, int E_Age_v, int E_Spch, int E_Spch_v, int E_Proso, int E_Proso_d, int E_Proso_F0, int E_Proso_e, int E_Video, int E_Lip, int E_Trick )
{
    int i, itime, Nphone, dur;  
    short Dur[800], F0[800][3], En[800][3];  
    float stime;
    char fname[30],Text[800];
	short Sphone[800][6];
    FILE *ftext, *fvideo, *fprosody, *fout; 

	G_Lang_v = E_Lang_v; G_Gender = E_Gender; G_Gender_v = E_Gender_v;
	G_Age = E_Age; G_Age_v = E_Age_v;  G_Spch = E_Spch; G_Spch_v=E_Spch_v; 
	G_Proso =  E_Proso;  G_Proso_d = E_Proso_d; G_Proso_F0 = E_Proso_F0; G_Proso_e = E_Proso_e; 
	G_Video = E_Video; G_Lip = E_Lip; G_Trick = E_Trick;

	/* We assume time duration between adjacent I-frames is 0.5sec */
    itime=500;  /* msec */

    i=0;
    ttsSequence *ttsSeq = new ttsSequence(1);

    ttsSeq->init();

    ttsSentence *ttsSnt = new ttsSentence(++i);
    ttsSeq->AddSentence(ttsSnt);
    ttsSeq->Sntinit();

    YY[0]=0; YY[1]=8;

	/* Check Data files */

    if((ftext=fopen("ko_text.dat","r"))==NULL)
	{ ::MessageBox(NULL, "Error: ko_text.dat open", "ERROR", MB_OK); exit(1); }

    if(ttsSeq->get_prosodyEn()) {
		if(ttsSeq->get_gender()) strcpy(fname,"ko_m-prosody.dat");
		else strcpy(fname,"ko_f-prosody.dat");
    	if((fprosody=fopen(fname,"r"))==NULL)
		{ ::MessageBox(NULL, "Error: prosody data open", "ERROR", MB_OK); exit(1); }
	}
    if(ttsSeq->get_videoEn()) {
		if(ttsSeq->get_gender()) strcpy(fname,"ko_m-video.dat");
		else strcpy(fname,"ko_f-video.dat");
    	if((fvideo=fopen(fname,"r"))==NULL)
		{ ::MessageBox(NULL, "Err: video data open", "ERROR", MB_OK); exit(1); }
	}

    fout=fopen("mpeg_tts.dat","w+b");
    ttsSeq->save_ttsSequence(fout);
    while( (i=rd_txt(ftext,Text)) > 1 ) {
		ttsSeq->put_text(Text);
		i=ttsSeq->get_text(Text); //printf("%d %s\n",i,Text);

    	if(ttsSeq->get_prosodyEn()) {
			Nphone=read_prosody(fprosody,Sphone,Dur,F0,En);
			if( Dur[0] == -1000 )	ttsSeq->set_durEn(0);
			if( F0[0][0] == -1000 )	ttsSeq->set_f0En(0);
			if( En[0][0] == -1000 )	ttsSeq->set_enEn(0);
			ttsSeq->put_prosody(Nphone,Sphone,Dur,F0,En);
	    }

    	if(ttsSeq->get_videoEn()) {
			read_video(fvideo,&stime,&dur);
			ttsSeq->put_position(itime,stime,dur);
	    }

    	ttsSeq->save_ttsSentence(fout);
		ttsSeq->clear_ttsSentence();
	} 
	fclose(ftext); fclose(fout);

}

/*
 * Reads the next sentence from fp into Text and returns its length.
 *
 *   - newline, carriage return, NUL and tab are turned into a blank, and
 *     runs of blanks (including leading ones) collapse to a single blank;
 *   - the characters " ' { } [ ] < > ( ) , are dropped;
 *   - a sentence ends at '?', '!' or at a '.' that is not followed by a
 *     digit (the character after such a '.' is consumed); a '.' followed
 *     by a digit is kept together with the digit;
 *   - when the input or the buffer runs out first, a trailing blank is
 *     removed and a '.' is appended.
 *
 * Text must hold at least 800 bytes (the size Encoder() passes).  The
 * result is always NUL-terminated.  At end of input the result is "."
 * with length 1.
 *
 * Fixes over the previous version:
 *   - the length limit leaves room for the digit after a '.', the
 *     appended '.' and the terminator, so nothing is written past
 *     Text[799] any more (it used to reach Text[801]);
 *   - Text[-1] is no longer read when nothing was stored;
 *   - the limit is tested before reading, so no character is consumed and
 *     lost when the buffer fills up.
 */
int rd_txt(FILE *fp, char *Text)
{
    const int kTextCapacity=800;            /* bytes available in Text */
    const int kMaxBody=kTextCapacity-3;     /* room for digit, '.', NUL */
    int sentnce_end=0, i=0, ch; char pchr;

    pchr=32;
    while( i < kMaxBody && (ch=fgetc(fp)) != EOF ) {
        if(ch=='\n' || ch=='\r' || ch == '\0' || ch =='\t') ch=32;
        if(ch=='"' || ch=='\'' || ch=='{' || ch=='}' || ch=='[' || ch==']' ||
           ch=='<' || ch=='>' || ch=='(' || ch==')' || ch==',' ) continue;
        /* Store everything except a blank that follows another blank. */
        if(ch != 32 || pchr != 32) Text[i++]=ch;
        if(ch=='.') {
            /* "3.14": a digit after the period keeps the sentence going. */
            ch=fgetc(fp); if(!isdigit(ch)) { sentnce_end=1; break; }
            Text[i++]=ch;
        }
        pchr=ch; if( ch=='?' || ch=='!') { sentnce_end=1; break; }
    }
    if(sentnce_end == 0) { if(i>0 && Text[i-1]==32) i--;  Text[i++]='.'; }
    Text[i]=0; return(i);
}

int read_prosody(FILE *fp,short Sphon[][6],short Dur[],short F0[][3],short En[][3])
{
	int i, j, k, m[10], n=0, np, nIPA;
    char line[100], str[15][20], *tokn;
    char *field[]={ "IPA_phones","dur","F0_1","F0_2","F0_3","En_1","En_2","En_3" };

    for(i=0; i<8; i++) m[i]=-1;
    fgets(line,90,fp);
    for(i=0, tokn=strtok(line," \n"); tokn!=NULL; tokn=strtok(NULL," \n")) 
	strcpy(str[i++],tokn);
    np=i-1;
    if(strcmp(str[0],"begin")!=0) { ::MessageBox(NULL, "prosody file err", "Error", MB_OK); exit(1); }
    for(j=1; j<i; j++) {
	for(k=0; k<8; k++) 
	    if(strcmp(str[j],field[k])==0) { m[k]=j-1; break; }
	}

    while(fgets(line,90,fp)!=NULL) {
    	for(i=0, tokn=strtok(line," \n"); tokn!=NULL; tokn=strtok(NULL," \n")) 
	    strcpy(str[i++],tokn);
		if(strcmp(str[0],"end")==0) break;
		nIPA=i-(np-1);
		for(j=0; j<nIPA; j++) {
			sscanf(str[m[0]+j],"%x",&k); Sphon[n][j]=k;
			}
		Sphon[n][j]=0;
		if(m[1]!=-1) Dur[n]=(int)(atof(str[m[1]+nIPA-1])*1000); 
		else Dur[0] = -1000;
		for(i=2; i<5; i++) {
			if(m[i]!=-1) F0[n][i-2]=atoi(str[m[i]+nIPA-1])/2; 
			else { F0[0][0] = -1000; break; }
		}
		for(i=5; i<8; i++) {
			if(m[i]!=-1) En[n][i-5]=atoi(str[m[i]+nIPA-1]); 
			else { En[0][0] = -1000; break;	}
		}
		n++;
	}
    return(n);

}

void read_video(FILE *fp,float *stime,int *dur)
{
    int i; float t;
    char line[100], str[10][10], *tokn;
    
    fgets(line,90,fp);
    for(i=0, tokn=strtok(line," \n"); tokn!=NULL; tokn=strtok(NULL," \n")) 
	strcpy(str[i++],tokn);
    if(strcmp(str[0],"begin")!=0) { ::MessageBox(NULL, "video file err", "Error", MB_OK); exit(1); }
    fgets(line,90,fp);  tokn=strtok(line," \n"); *stime=(float)atof(tokn);
    fgets(line,90,fp);  tokn=strtok(line," \n"); t=(float)atof(tokn);
    *dur=(int)((t-*stime)*1000);
    fgets(line,90,fp);  tokn=strtok(line," \n");
    if(strcmp(tokn,"end")!=0) { ::MessageBox(NULL, "video file err", "Error", MB_OK); exit(1); }
}
